package com.novel.crawl.bxwx9.spider.processor;

import com.novel.crawl.common.entity.Book;
import com.novel.crawl.common.entity.Chapter;
import com.novel.crawl.common.entity.Content;
import com.novel.crawl.common.service.BookService;
import com.novel.crawl.common.service.ChapterService;
import com.novel.crawl.common.service.ContentService;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.CollectionUtils;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Request;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;
import us.codecraft.webmagic.selector.Selectable;

import java.util.Comparator;
import java.util.List;
import java.util.stream.Collectors;

/**
 * WebMagic page processor for novel pages.
 *
 * <p>Requests whose URL ends with {@code index.html} are treated as chapter
 * index pages; every other request is treated as a chapter content page.
 * Parsed chapters and contents are stored directly through the injected
 * services rather than through a WebMagic pipeline.
 *
 * @author 奔波儿灞
 * @since 1.0
 */
public class NovelProcessor implements PageProcessor {

    private static final Logger LOG = LoggerFactory.getLogger(NovelProcessor.class);

    /** Request-extra key under which the {@link Book} of an index page is expected. */
    public static final String ITEM_BOOK = "ITEM_BOOK";

    /** Request-extra key carrying the chapter id of a content page request. */
    private static final String ITEM_CHAPTER_ID = "ITEM_CHAPTER_ID";

    private static final String HTML_SUFFIX = ".html";
    private static final String INDEX_HTML = "index.html";

    private final BookService bookService;
    private final ChapterService chapterService;
    private final ContentService contentService;

    /**
     * Site settings: UTF-8 charset, 3 retries, 3 cycle retries, 1000 ms sleep time.
     */
    private final Site site = Site.me()
                                .setCharset("UTF-8")
                                .setRetryTimes(3)
                                .setCycleRetryTimes(3)
                                .setSleepTime(1000);

    /**
     * Creates a processor that stores its results through the given services.
     *
     * @param bookService    used to update the book's latest-chapter fields
     * @param chapterService used to look up the last known chapter and to save new chapters
     * @param contentService used to save chapter contents
     */
    public NovelProcessor(BookService bookService, ChapterService chapterService, ContentService contentService) {
        this.bookService = bookService;
        this.chapterService = chapterService;
        this.contentService = contentService;
    }

    /**
     * Dispatches the page to chapter-index or content handling based on its URL.
     *
     * @param page the downloaded page
     */
    @Override
    public void process(Page page) {
        String currentUrl = page.getRequest().getUrl();
        if (StringUtils.endsWith(currentUrl, INDEX_HTML)) {
            // Chapter index page.
            processChapter(page);
        } else {
            // Chapter content page.
            processContent(page);
        }
    }

    /**
     * Parses a chapter index page, stores chapters that are newer than the book's
     * recorded latest chapter, updates the book, and queues one content request
     * per new chapter.
     *
     * @param page chapter index page; its request is expected to carry the
     *             {@link Book} under {@link #ITEM_BOOK}
     */
    private void processChapter(Page page) {
        final String bookUrl = page.getRequest().getUrl();
        // getExtra tolerates a request that carries no extras at all.
        final Book book = (Book) page.getRequest().getExtra(ITEM_BOOK);
        if (book == null) {
            LOG.warn("skip index page without book, url: {}", bookUrl);
            return;
        }
        final String bookId = book.getId();

        List<Chapter> chapters = page.getHtml()
                // Anchor elements of the chapter list.
                .xpath("//*[@id='TabCss']/dl/dd/a")
                .nodes()
                .stream()
                .map(selectable -> parseChapter(bookId, bookUrl, selectable))
                // parseChapter yields null for links it cannot interpret.
                .filter(chapter -> chapter != null)
                .sorted(Comparator.comparing(Chapter::getSortedValue))
                .collect(Collectors.toList());
        LOG.debug("parse novel, bookId: {}, chapters num: {}", bookId, chapters.size());

        List<Chapter> newChapters = selectNewChapters(book, chapters);
        if (CollectionUtils.isEmpty(newChapters)) {
            LOG.debug("no new chapters, bookId: {}", bookId);
            return;
        }

        // Saved directly instead of going through a pipeline.
        LOG.debug("save new chapters, bookId: {}, chapter num: {}", bookId, newChapters.size());
        chapterService.batchAdd(newChapters);

        // The list is sorted ascending, so the last element is the latest chapter.
        Chapter latest = newChapters.get(newChapters.size() - 1);
        book.setNewChapterId(latest.getId());
        book.setNewChapterName(latest.getTitle());
        bookService.modify(book);
        LOG.debug("modify book' new chapter, bookId: {}, chapterId: {}", bookId, latest.getId());

        // Queue the content page of every new chapter, binding the chapter id to the request.
        newChapters.stream()
                .map(this::toRequest)
                .forEach(page::addTargetRequest);
    }

    /**
     * Returns the chapters whose sort value is greater than that of the book's
     * recorded latest chapter. All chapters are returned when the book has no
     * latest-chapter id or that chapter cannot be found.
     *
     * @param book     book whose latest-chapter id is consulted
     * @param chapters parsed chapters, sorted ascending by sort value
     * @return the chapters considered new, in the same order
     */
    private List<Chapter> selectNewChapters(Book book, List<Chapter> chapters) {
        String newChapterId = book.getNewChapterId();
        if (StringUtils.isEmpty(newChapterId)) {
            return chapters;
        }
        Chapter newChapter = chapterService.findById(newChapterId);
        if (newChapter == null) {
            return chapters;
        }
        return chapters.stream()
                .filter(chapter -> chapter.getSortedValue() > newChapter.getSortedValue())
                .collect(Collectors.toList());
    }

    /**
     * Extracts the chapter text from a content page and stores it.
     *
     * <p>The page is skipped with a warning when its request carries no chapter
     * id or when the content element is missing.
     *
     * @param page content page; its request is expected to carry the chapter id
     *             under {@code ITEM_CHAPTER_ID}
     */
    private void processContent(Page page) {
        final String chapterId = (String) page.getRequest().getExtra(ITEM_CHAPTER_ID);
        if (chapterId == null) {
            LOG.warn("skip content page without chapterId, url: {}", page.getRequest().getUrl());
            return;
        }

        String rawHtml = page.getHtml()
                .xpath("//div[@id='content']/html()")
                .get();
        if (rawHtml == null) {
            // The content element is absent; there is nothing to store.
            LOG.warn("content element not found, chapterId: {}, url: {}", chapterId, page.getRequest().getUrl());
            return;
        }

        String content = rawHtml
                .replace("<div id=\"adright\"></div>", StringUtils.EMPTY)
                // Literal replacement; no regex is needed to drop line breaks.
                .replace("\n", StringUtils.EMPTY);
        // Saved directly instead of going through a pipeline.
        LOG.debug("save new content, chapterId: {}", chapterId);
        contentService.add(new Content(chapterId, content));
    }

    /**
     * Builds a {@link Chapter} from one chapter link.
     *
     * <p>The sort value is the link's href with {@code .html} removed, parsed as
     * a long.
     *
     * @param bookId     id of the owning book
     * @param bookUrl    URL of the chapter index page
     * @param selectable the {@code <a>} element of the chapter
     * @return the parsed chapter, or {@code null} when the link has no href or
     *         its href does not yield a numeric sort value
     */
    private Chapter parseChapter(String bookId, String bookUrl, Selectable selectable) {
        String title = selectable.xpath("/a/text()").get();
        String relativeUrl = selectable.xpath("/a/@href").get();
        if (StringUtils.isBlank(relativeUrl)) {
            LOG.warn("skip chapter link without href, bookId: {}, title: {}", bookId, title);
            return null;
        }

        Long sortedValue;
        try {
            sortedValue = Long.valueOf(relativeUrl.replace(HTML_SUFFIX, StringUtils.EMPTY));
        } catch (NumberFormatException e) {
            // One unexpected link must not abort parsing of the whole index page.
            LOG.warn("skip chapter link with non-numeric href, bookId: {}, href: {}", bookId, relativeUrl);
            return null;
        }

        String url = bookUrl.replace(INDEX_HTML, relativeUrl);
        return new Chapter(bookId, title, url, relativeUrl, sortedValue);
    }

    /**
     * Creates the content-page request for a chapter, carrying the chapter id as
     * a request extra.
     *
     * @param chapter chapter whose content page should be fetched
     * @return request for the chapter URL
     */
    private Request toRequest(Chapter chapter) {
        Request request = new Request(chapter.getUrl());
        request.putExtra(ITEM_CHAPTER_ID, chapter.getId());
        return request;
    }

    /**
     * Returns the site settings used by this processor.
     *
     * @return the site configuration
     */
    @Override
    public Site getSite() {
        return site;
    }

}
